{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TASK 2 (2017-2019) - English and French Views\n",
    "\n",
    "\n",
    "import sys\n",
    "import os\n",
    "import logging\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from joblib import dump, load\n",
    "\n",
    "import views\n",
    "from views import Ensemble, Model, Downsampling, Period\n",
    "from views.utils.data import assign_into_df\n",
    "from views.apps.transforms import lib as translib\n",
    "from views.apps.evaluation import lib as evallib, feature_importance as fi\n",
    "from views.apps.model import api\n",
    "from views.apps.extras import extras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the \"cm_africa_imp_0\" entry in views.DATASETS and use its\n",
    "# DataFrame as the working frame for the rest of the notebook.\n",
    "dataset = views.DATASETS[\"cm_africa_imp_0\"]\n",
    "df = dataset.df\n",
    "\n",
    "# Level tag; it is interpolated into the scores-table header and file name further down.\n",
    "level = \"cm\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output sub-directories below ./models/eval_real, keyed by purpose.\n",
    "model_path = \"./models/eval_real/{sub}\"\n",
    "out_paths = {\n",
    "    \"evaluation2\": model_path.format(sub=\"evaluation2\"),\n",
    "    \"features2\": model_path.format(sub=\"features2\")\n",
    "}\n",
    "\n",
    "# exist_ok=True creates missing directories and is a no-op for existing ones,\n",
    "# which avoids the check-then-create race of a separate os.path.isdir() test.\n",
    "for out_dir in out_paths.values():\n",
    "    os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Wikipedia data CSV and join it onto the ViEWS frame.\n",
    "# The CSV location can be overridden through the WIKI_DATA_PATH environment\n",
    "# variable; when it is unset, the original author's local path is used.\n",
    "wiki_path = os.environ.get(\n",
    "    \"WIKI_DATA_PATH\",\n",
    "    r'/home/default/Dokumente/TRINITY/comp/prediction-project/data/Wiki_Data_Final_2021.csv',\n",
    ")\n",
    "wiki = pd.read_csv(wiki_path)\n",
    "\n",
    "# Index the Wikipedia rows by (month_id, country_id), ordered by month_id.\n",
    "multi = wiki.set_index(['month_id', 'country_id'])\n",
    "multi = multi.sort_index(level=0)\n",
    "\n",
    "# Left join keeps every row of df; rows without a Wikipedia match get NaN there.\n",
    "# NOTE: re-running this cell merges a second time - restart the kernel instead.\n",
    "df = df.merge(multi, on=['month_id', 'country_id'], how='left')\n",
    "\n",
    "# Replaces every remaining NaN in the whole frame (not only the Wikipedia columns) with 0.\n",
    "df = df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calibration and test partitions, expressed as month ids.\n",
    "# NOTE(review): month 444 (2016.12 by the numbering in the comments below) is in\n",
    "# neither training window nor predict window - confirm that this gap is intended.\n",
    "calib_spec = dict(\n",
    "    name=\"calib\",\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=408,      # 2013.12\n",
    "    predict_start=409,  # 2014.01\n",
    "    predict_end=443,    # 2016.11\n",
    ")\n",
    "test_spec = dict(\n",
    "    name=\"test\",\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=443,      # 2016.11\n",
    "    predict_start=445,  # 2017.01\n",
    "    predict_end=480,    # 2019.12\n",
    ")\n",
    "\n",
    "period_calib = api.Period(**calib_spec)\n",
    "period_test = api.Period(**test_spec)\n",
    "\n",
    "# Order matters only in that both periods are handed to the model together.\n",
    "periods = [period_calib, period_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step values (2 through 7 inclusive) to train, predict and evaluate for.\n",
    "steps = list(range(2, 8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the \"most important\" features from the ViEWS dataset plus the two\n",
    "# Wikipedia variables. This is a plain list of column names rather than a\n",
    "# DataFrame slice: the names are what is passed as cols_features to api.Model\n",
    "# further down, and a list does not hold a copy of the feature data.\n",
    "cols_features = [\n",
    "    'ged_best_sb', 'wdi_vc_btl_deth', 'reign_precip', 'reign_prev_conflict',\n",
    "    'reign_tenure_months', 'reign_irregular', 'reign_lastelection', 'reign_loss',\n",
    "    'reign_pctile_risk', 'reign_couprisk', 'ged_count_sb', 'ged_best_os',\n",
    "    'ged_count_os', 'wdi_eg_use_pcap_kg_oe', 'ged_best_ns', 'vdem_v2x_accountability',\n",
    "    'wdi_nv_srv_totl_zs', 'wdi_sp_pop_totl', 'wdi_sl_tlf_totl_fe_zs', 'wdi_sm_pop_totl_zs',\n",
    "    'wdi_sm_pop_refg_or', 'wdi_dt_oda_odat_pc_zs', 'ged_count_ns', 'reign_gov_dominant_party',\n",
    "    'reign_age', 'vdem_v2xpe_exlpol', 'wdi_ag_lnd_frst_k2', 'wdi_bg_gsr_nfsv_gd_zs',\n",
    "    'wdi_st_int_rcpt_xp_zs', 'vdem_v2x_clpol', 'vdem_v2x_genpp',\n",
    "    'viewsFrench', 'viewsEnglish',\n",
    "]\n",
    "\n",
    "# Fail early (as the former df[[...]] selection did) if a feature is absent from df.\n",
    "missing_features = [col for col in cols_features if col not in df.columns]\n",
    "if missing_features:\n",
    "    raise KeyError(f\"Features missing from df: {missing_features}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of trees in the random forest (passed as n_estimators to RandomForestRegressor).\n",
    "n_estimators = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the model. Nothing is fitted here; fitting happens in the\n",
    "# fit_estimators() cell further down.\n",
    "\n",
    "task2_delta = api.Model(\n",
    "    name=\"task2_delta\",\n",
    "    col_outcome=\"ln_ged_best_sb\",\n",
    "    cols_features=cols_features,\n",
    "    steps=steps,\n",
    "    outcome_type=\"real\",\n",
    "    periods=periods,\n",
    "    estimator=RandomForestRegressor(\n",
    "        n_estimators=n_estimators,\n",
    "        # criterion is left at its default, which is squared error in every\n",
    "        # scikit-learn release: it was spelled \"mse\" before 1.0 and is spelled\n",
    "        # \"squared_error\" since; the literal \"mse\" is rejected from 1.2 onwards.\n",
    "        n_jobs=-1,\n",
    "        # Fixed seed so that repeated runs grow the same forest.\n",
    "        random_state=42,\n",
    "    ),\n",
    "    delta_outcome=True,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Models that the fit / predict / evaluate loops below iterate over.\n",
    "models = [task2_delta]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 15min 32s, sys: 15.5 s, total: 15min 48s\n",
      "Wall time: 15min 34s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Fit the estimators of every model in `models` on df.\n",
    "# The saved output of this cell reports roughly 15 minutes of wall time.\n",
    "for model in models:\n",
    "    model.fit_estimators(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every model: write its predictions into df, then compute calibrated\n",
    "# predictions from that updated frame and write those into df as well.\n",
    "for model in models:\n",
    "    raw_predictions = model.predict(df)\n",
    "    df = assign_into_df(df, raw_predictions)\n",
    "\n",
    "    calibrated_predictions = model.predict_calibrated(\n",
    "        df=df, period_calib=period_calib, period_test=period_test\n",
    "    )\n",
    "    df = assign_into_df(df, calibrated_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the evaluation of every model against df.\n",
    "# The results are read back from model.scores in the next cell.\n",
    "for model in models:\n",
    "    model.evaluate(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wrote scores table to ./models/eval_real/evaluation2/task2_delta_cm_uncalibrated_scores.tex.\n"
     ]
    }
   ],
   "source": [
    "# Tabulate the per-step scores of the test partition and write them out as LaTeX.\n",
    "\n",
    "partition = \"test\"\n",
    "\n",
    "for model in models:\n",
    "    for calib in [\"uncalibrated\"]:\n",
    "        # Every key of the partition's score dict other than \"sc\" is treated as a step.\n",
    "        step_scores = {\n",
    "            key: value[calib]\n",
    "            for key, value in model.scores[partition].items()\n",
    "            if key != \"sc\"\n",
    "        }\n",
    "\n",
    "        scores = {\n",
    "            \"Step\": list(step_scores),\n",
    "            \"MSE\": [entry[\"mse\"] for entry in step_scores.values()],\n",
    "        }\n",
    "        # TADDA is only reported for models with a delta outcome.\n",
    "        if model.delta_outcome:\n",
    "            scores[\"TADDA\"] = [entry[\"tadda_score\"] for entry in step_scores.values()]\n",
    "\n",
    "        # `out` is left in the namespace on purpose: the next cell prints it.\n",
    "        out = pd.DataFrame(scores)\n",
    "        tex = out.to_latex(index=False)\n",
    "\n",
    "        # Header of LaTeX comment lines recording how the table was produced.\n",
    "        now = datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")\n",
    "        meta = f\"\"\"\n",
    "        %Output created by wb_models.ipynb.\n",
    "        %Evaluation of {model.col_outcome} per step.\n",
    "        %Run on selected {model.name} features at {level} level.\n",
    "        %Produced on {now}, written to {out_paths[\"evaluation2\"]}.\n",
    "        \\\\\n",
    "        \"\"\"\n",
    "        tex = meta + tex\n",
    "        path_out = os.path.join(\n",
    "            out_paths[\"evaluation2\"], \n",
    "            f\"{model.name}_{level}_{calib}_scores.tex\"\n",
    "        )\n",
    "        # The write had been commented out, leaving tex and path_out unused even\n",
    "        # though the header above and the saved cell output say the file is written.\n",
    "        with open(path_out, \"w\") as f:\n",
    "            f.write(tex)\n",
    "        print(f\"Wrote scores table to {path_out}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Step       MSE     TADDA\n",
      "0     2  0.709900  0.552584\n",
      "1     3  0.772212  0.567361\n",
      "2     4  0.702951  0.524821\n",
      "3     5  0.811074  0.575887\n",
      "4     6  0.774632  0.573834\n",
      "5     7  0.817280  0.582026\n"
     ]
    }
   ],
   "source": [
    "# `out` is the scores table built in the last iteration of the loop in the previous cell.\n",
    "print(out)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
